{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Note:  This notebook takes a long time due to the amount of data involved.  Do not run this.  Use the more-scalable mechanisms presented in this material."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train an XGBoost Model with Jupyter Notebook\n",
    "We will train a custom XGBoost directly in this notebook to predict sentiment of Amazon customer reviews."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install -q boto3\n",
    "!pip install -q xgboost==0.90"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Copy the datasets from S3 to this notebook instance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r spark_processing_job_s3_output_prefix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Previous Spark Processing Job Name: {}'.format(spark_processing_job_s3_output_prefix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_train = '{}/output/tfidf-train'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_validation = '{}/output/tfidf-validation'.format(spark_processing_job_s3_output_prefix)\n",
    "prefix_test = '{}/output/tfidf-test'.format(spark_processing_job_s3_output_prefix)\n",
    "\n",
    "tfidf_train_path = './{}'.format(prefix_train)\n",
    "tfidf_validation_path = './{}'.format(prefix_validation)\n",
    "tfidf_test_path = './{}'.format(prefix_test)\n",
    "\n",
    "tfidf_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)\n",
    "tfidf_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)\n",
    "tfidf_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)\n",
    "\n",
    "import os\n",
    "os.makedirs(prefix_train, exist_ok=True)\n",
    "os.makedirs(prefix_validation, exist_ok=True)\n",
    "os.makedirs(prefix_test, exist_ok=True)\n",
    "\n",
    "tfidf_train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)\n",
    "tfidf_validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)\n",
    "tfidf_test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)\n",
    "\n",
    "print(tfidf_train_s3_uri)\n",
    "print(tfidf_validation_s3_uri)\n",
    "print(tfidf_test_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls $tfidf_train_s3_uri/ "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls $tfidf_validation_s3_uri/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls $tfidf_test_s3_uri/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!aws s3 cp --recursive $tfidf_train_s3_uri $tfidf_train_path\n",
    "!aws s3 cp --recursive $tfidf_validation_s3_uri $tfidf_validation_path\n",
    "!aws s3 cp --recursive $tfidf_test_s3_uri $tfidf_test_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import glob\n",
    "import pandas as pd\n",
    "\n",
    "def load_dataset(path, sep, header):\n",
    "    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)\n",
    "\n",
    "    labels = data.iloc[:,0]\n",
    "    features = data.drop(data.columns[0], axis=1)\n",
    "    \n",
    "    if header==None:\n",
    "        # Adjust the column names after dropping the 0th column above\n",
    "        # New column names are 0 (inclusive) to len(features.columns) (exclusive)\n",
    "        new_column_names = list(range(0, len(features.columns)))\n",
    "        features.columns = new_column_names\n",
    "\n",
    "    return features, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Balanced\n",
    "X_train, y_train = load_dataset(path=tfidf_train_path, sep=',', header=None)\n",
    "X_validation, y_validation = load_dataset(path=tfidf_validation_path, sep=',', header=None)\n",
    "X_test, y_test = load_dataset(path=tfidf_test_path, sep=',', header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train the model\n",
    "_This will take a few minutes.  Please be patient._"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "objective  = 'binary:logistic'\n",
    "max_depth  = 5\n",
    "num_round  = 1\n",
    "\n",
    "xgb_estimator = XGBClassifier(objective=objective,\n",
    "                              num_round=num_round,\n",
    "                              max_depth=max_depth)\n",
    "\n",
    "xgb_estimator.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pickle as pkl\n",
    "\n",
    "# See https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html\n",
    "# Need to save with joblib or pickle.  `xgb.save_model()` does not save feature_names\n",
    "model_dir  = './models/notebook/'\n",
    "\n",
    "os.makedirs(model_dir, exist_ok=True)\n",
    "model_path = os.path.join(model_dir, 'xgboost-model')\n",
    "\n",
    "pkl.dump(xgb_estimator, open(model_path, 'wb'))\n",
    "\n",
    "print('Wrote model to {}'.format(model_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Restore Model \n",
    "This simulates restoring a model within an application."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "import os\n",
    "\n",
    "model_dir  = './models/notebook/'\n",
    "model_path = os.path.join(model_dir, 'xgboost-model')\n",
    "\n",
    "xgb_estimator_restored = pkl.load(open(model_path, 'rb'))\n",
    "\n",
    "type(xgb_estimator_restored)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Plot the feature importance for this model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "import xgboost\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12,12))\n",
    "xgboost.plot_importance(xgb_estimator_restored, \n",
    "                        importance_type='gain', \n",
    "                        max_num_features=30, \n",
    "                        height=0.8, \n",
    "                        ax=ax, \n",
    "                        show_values = True)\n",
    "plt.title('Feature Importance')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate Validation Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "X_validation.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_validation = xgb_estimator_restored.predict(X_validation)\n",
    "preds_validation.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "preds_validation_0_or_1 = np.where(preds_validation > 0.5, 1, 0)\n",
    "preds_validation_0_or_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "print('Validation Accuracy: ', accuracy_score(y_validation, preds_validation_0_or_1))\n",
    "print('Validation Precision: ', precision_score(y_validation, preds_validation_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(preds_validation).head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_validation, preds_validation_0_or_1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_validation = confusion_matrix(y_validation, preds_validation_0_or_1)\n",
    "df_cm_validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import numpy as np\n",
    "\n",
    "def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):\n",
    "    print(cm)\n",
    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    tick_marks = np.arange(len(classes))\n",
    "    plt.xticks(tick_marks, classes, rotation=45)\n",
    "    plt.yticks(tick_marks, classes)\n",
    "\n",
    "    fmt = 'd'\n",
    "    thresh = cm.max() / 2.\n",
    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
    "        plt.text(j, i, format(cm[i, j], fmt),\n",
    "        horizontalalignment=\"center\",\n",
    "        color=\"black\" if cm[i, j] > thresh else \"black\")\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.ylabel('True label')\n",
    "        plt.xlabel('Predicted label')\n",
    "\n",
    "# Plot non-normalized confusion matrix\n",
    "plt.figure()\n",
    "fig, ax = plt.subplots(figsize=(10,5))\n",
    "plot_conf_mat(df_cm_validation, classes=['Not Positive Sentiment', 'Positive Sentiment'], \n",
    "                          title='Confusion matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "#print(metrics.f1_score(y_validation, preds_validation))\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_validation, preds_validation_0_or_1), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_validation, preds_validation_0_or_1)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate Test Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_test = xgb_estimator_restored.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "preds_test_0_or_1 = np.where(preds_test > 0.5, 1, 0)\n",
    "preds_test_0_or_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "print('Test Accuracy: ', accuracy_score(y_test, preds_test_0_or_1))\n",
    "print('Test Precision: ', precision_score(y_test, preds_test_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(classification_report(y_test, preds_test_0_or_1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_test = confusion_matrix(y_test, preds_test_0_or_1)\n",
    "df_cm_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):\n",
    "    print(cm)\n",
    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    tick_marks = np.arange(len(classes))\n",
    "    plt.xticks(tick_marks, classes, rotation=45)\n",
    "    plt.yticks(tick_marks, classes)\n",
    "\n",
    "    fmt = 'd'\n",
    "    thresh = cm.max() / 2.\n",
    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
    "        plt.text(j, i, format(cm[i, j], fmt),\n",
    "        horizontalalignment=\"center\",\n",
    "        color=\"black\" if cm[i, j] > thresh else \"black\")\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.ylabel('True label')\n",
    "        plt.xlabel('Predicted label')\n",
    "\n",
    "# Plot non-normalized confusion matrix\n",
    "plt.figure()\n",
    "fig, ax = plt.subplots(figsize=(6,4))\n",
    "plot_conf_mat(df_cm_test, classes=['Not Positive Sentiment', 'Positive Sentiment'], \n",
    "                          title='Confusion matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_test, preds_test_0_or_1), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_test, preds_test_0_or_1)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predict in Notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create `feature_transform_fn()` function (same used during `prepare` phase)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use TruncatedSVD vs. PCA\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "def feature_transform_fn(df_text, column_name, num_components):\n",
    "    text_processors = Pipeline(\n",
    "        steps=[\n",
    "            (\n",
    "                'tfidfvectorizer',\n",
    "                TfidfVectorizer(\n",
    "                    max_df=0.25,                                       \n",
    "                    min_df=.0025,\n",
    "                    analyzer='word',\n",
    "                    max_features=10000\n",
    "                )\n",
    "            )\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    column_transformer = ColumnTransformer(\n",
    "        transformers=[('text_processing', text_processors, df_text.columns.get_loc(column_name))]\n",
    "    )\n",
    "\n",
    "    pipeline = Pipeline(\n",
    "        steps=[\n",
    "            ('column_transformer', column_transformer), \n",
    "            ('dimension_reducer', TruncatedSVD(n_components=num_components)),\n",
    "            ('standard_scaler', StandardScaler())\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    return pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# $S3_BUCKET/feature-store/amazon-reviews/scrubbed-raw-with-header\n",
    "\n",
    "prefix_raw = 'feature-store/amazon-reviews/raw-labeled-split-balanced-header-test/'\n",
    "\n",
    "scrubbed_raw_path = './{}'.format(prefix_raw)\n",
    "\n",
    "import os\n",
    "os.makedirs(prefix_raw, exist_ok=True)\n",
    "\n",
    "scrubbed_raw_s3_uri = 's3://{}/{}'.format(bucket, prefix_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 cp --recursive $scrubbed_raw_s3_uri $scrubbed_raw_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_raw, y_raw = load_dataset(path=scrubbed_raw_path, sep=',', header=0)\n",
    "X_raw.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_raw.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transform raw to tfidf (we've already done this, but showing it again for clarity)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np_tfidf = feature_transform_fn(df_text=X_raw, column_name='review_body', num_components=300).fit_transform(X_raw)\n",
    "df_tfidf = pd.DataFrame(np_tfidf)\n",
    "df_tfidf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df_tfidf.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_raw.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_raw.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "preds_raw = xgb_estimator_restored.predict(df_tfidf)\n",
    "df_preds_raw = pd.DataFrame(preds_raw)\n",
    "df_preds_raw.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO:  This isn't needed anymore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "preds_raw_0_or_1 = np.where(preds_raw > 0.5, 1, 0)\n",
    "preds_raw_0_or_1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix\n",
    "\n",
    "print('Test Accuracy: ', accuracy_score(y_raw, preds_raw_0_or_1))\n",
    "print('Test Precision: ', precision_score(y_raw, preds_raw_0_or_1, average=None))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import seaborn as sn\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "df_cm_raw = confusion_matrix(y_raw, preds_raw_0_or_1)\n",
    "df_cm_raw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):\n",
    "    print(cm)\n",
    "    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    tick_marks = np.arange(len(classes))\n",
    "    plt.xticks(tick_marks, classes, rotation=45)\n",
    "    plt.yticks(tick_marks, classes)\n",
    "\n",
    "    fmt = 'd'\n",
    "    thresh = cm.max() / 2.\n",
    "    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
    "        plt.text(j, i, format(cm[i, j], fmt),\n",
    "        horizontalalignment=\"center\",\n",
    "        color=\"black\" if cm[i, j] > thresh else \"black\")\n",
    "\n",
    "        plt.tight_layout()\n",
    "        plt.ylabel('True label')\n",
    "        plt.xlabel('Predicted label')\n",
    "\n",
    "# Plot non-normalized confusion matrix\n",
    "plt.figure()\n",
    "fig, ax = plt.subplots(figsize=(6,4))\n",
    "plot_conf_mat(df_cm_raw, classes=['Not Positive Sentiment', 'Positive Sentiment'], \n",
    "                          title='Confusion matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "auc = round(metrics.roc_auc_score(y_raw, preds_raw_0_or_1), 4)\n",
    "print('AUC is ' + repr(auc))\n",
    "\n",
    "fpr, tpr, _ = metrics.roc_curve(y_raw, preds_raw_0_or_1)\n",
    "\n",
    "plt.title('ROC Curve')\n",
    "plt.plot(fpr, tpr, 'b',\n",
    "label='AUC = %0.2f'% auc)\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0,1],[0,1],'r--')\n",
    "plt.xlim([-0.1,1.1])\n",
    "plt.ylim([-0.1,1.1])\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
